Machine Learning Approaches for Ammonia Volatilization Prediction after Manure Application

Authors

Armand Favrot

Sophie Génermont

Céline Decuq

David Makowski

library ("readxl") # version 1.4.3
library ("ggplot2") # version 3.4.4
library ("gridExtra") # version 2.3
library ("dplyr") # version 1.1.2      
library ("paletteer") # version 1.4.0
library ("tibble") # version 3.2.1
library ("tidyr") # version 1.3.0
library ("ALFAM2") # version 3.7
library ("randomForest") # version 4.7.1.1
library ("xgboost") # version 1.7.6.1
library ("glmnet") # version 4.1.8
library ("treeshap") # version 0.3.1
library ("shapviz") # version 0.9.3
library ("kableExtra") # version 1.3.4

Data description

load (file = "scripts/processed_data/data_alfam2.Rdata")
data_alfam2 %>% summarise (n = n(), .by = dataset)
             dataset    n
1 Calibration subset 5515
2  Evaluation subset  424
plot_data_description = data_alfam2 %>%

    filter (! (app.mthd == "bsth" & incorp == "shallow")) %>%

    mutate (strategy = paste (app.mthd, incorp, sep = " - ")) %>%
    mutate (strategy = recode (strategy, "bc - none" = "A", "bc - shallow" = "B", "bc - deep" = "C", "bsth - none" = "D", "os - none" = "E", "ts - none" = "F")) %>%

    ggplot () +
        geom_line (aes (x = time, y = e.rel, group = pmid), linewidth = 0.15) +
        ylab ("Cumulative emission (frac. applied TAN)") +
        xlab ("Time (h)") +
        theme (axis.title.y = element_text (margin = ggplot2::margin (r = 25)),
               strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
        # scale_color_manual (values = Dark2) +
        facet_wrap (~ strategy)

plot_data_description

data_alfam2 %>%

    select (air.temp, wind.2m, rain.rate, rain.cum) %>%

    summary 
    air.temp        wind.2m           rain.rate         rain.cum     
 Min.   :-1.90   Min.   : 0.05133   Min.   :0.0000   Min.   : 0.000  
 1st Qu.: 9.50   1st Qu.: 1.65665   1st Qu.:0.0000   1st Qu.: 0.000  
 Median :12.50   Median : 2.80000   Median :0.0000   Median : 0.000  
 Mean   :13.28   Mean   : 3.08744   Mean   :0.0406   Mean   : 1.131  
 3rd Qu.:16.85   3rd Qu.: 4.08000   3rd Qu.:0.0000   3rd Qu.: 0.400  
 Max.   :35.24   Max.   :16.78400   Max.   :7.1000   Max.   :55.900  
data_alfam2 %>% 

    filter (time == max (time), .by = pmid) %>%

    select (e.cum, time, tan.app, app.rate, man.dm, man.ph, t.incorp) %>%

    summary 
     e.cum              time          tan.app          app.rate     
 Min.   :  0.264   Min.   :20.75   Min.   : 10.90   Min.   :  6.60  
 1st Qu.:  5.815   1st Qu.:48.16   1st Qu.: 36.66   1st Qu.: 18.73  
 Median : 10.968   Median :70.30   Median : 54.28   Median : 28.40  
 Mean   : 16.162   Mean   :62.13   Mean   : 62.07   Mean   : 29.76  
 3rd Qu.: 21.473   3rd Qu.:71.70   3rd Qu.: 80.30   3rd Qu.: 36.65  
 Max.   :140.570   Max.   :78.00   Max.   :235.40   Max.   :132.60  
                                                                    
     man.dm           man.ph        t.incorp      
 Min.   : 1.000   Min.   :6.40   Min.   : 0.0000  
 1st Qu.: 3.790   1st Qu.:7.10   1st Qu.: 0.0000  
 Median : 6.755   Median :7.40   Median : 0.0833  
 Mean   : 6.248   Mean   :7.42   Mean   : 1.1878  
 3rd Qu.: 8.150   3rd Qu.:7.70   3rd Qu.: 0.0833  
 Max.   :13.600   Max.   :8.50   Max.   :48.0000  
                  NA's   :85     NA's   :475      

Part 1 - Model comparison

ALFAM2

Predictions of emissions using the ALFAM2 model.

# We use alfam2pars01 parameter without pH, like in Hafner et al, 2019
pars <- alfam2pars01[!grepl('man.ph', names(alfam2pars01))]

alfam2_predictions =  alfam2 (
        
        pars = pars,
    
        dat = data_alfam2 %>% select (- j.NH3, - e.cum, - e.rel, - dataset, - country),
        
        app.name = "tan.app",
        
        time.name = "time",
    
        time.incorp = "t.incorp",
        
        group = "pmid",
    
        prep = TRUE,
        
        warn = FALSE
)

We add the true values to the prediction dataframe and we keep only the last observation of each pmid.

alfam2_predictions = alfam2_predictions %>%

        select (pmid, time, e, er) %>%

        mutate (truth_e = data_alfam2$e.cum) %>%
        mutate (truth_er = data_alfam2$e.rel) %>%

        mutate (dataset = data_alfam2$dataset,
                country = data_alfam2$country,
                app.mthd = data_alfam2$app.mthd) %>%
        
        select (pmid, time, e, er, truth_e, truth_er, dataset, country, app.mthd) %>% 

        filter (time == max (time), .by = pmid)

Random forest

Predictions of emissions using random forest.

load (file = "scripts/processed_data/data_random_forest.Rdata")
data_random_forest_calibration = data_random_forest %>% 
    filter (dataset == "Calibration subset") %>%
    select (- pmid, - e.rel, - dataset, - country)

set.seed (123)
random_forest_model = randomForest (
    
        e.cum ~ ., 

        data = data_random_forest_calibration,

        importance = TRUE,
    
        mtry = 19, nodesize = 3, replace = FALSE, sample_frac = 0.8
    
)
plot (random_forest_model)

random_forest_predictions = data_random_forest %>%

    mutate (e.cum_hat = predict ( 
                random_forest_model,                       
                newdata = data_random_forest %>% 
                    select (- pmid, - e.cum, - e.rel, - dataset, - country)
            )
    ) %>%

    mutate (e.rel_hat = e.cum_hat / tan.app) %>%

    mutate (app.mthd = recode (as.character (app.mthd), "1" = "bc", "2" = "ts", "3" = "os", "4" = "bsth")) %>%

    select (pmid, time, e = e.cum_hat, er = e.rel_hat, truth_e = e.cum, truth_er = e.rel, dataset, country, app.mthd)    

Xgboost

Predictions of emissions using gradient boosting.

load (file = "scripts/processed_data/data_xgboost.Rdata")
set.seed (123)
xgboost_model = xgboost (  

    data = xgb.DMatrix (

        data = data_xgboost %>% 
            filter (dataset == "Calibration subset") %>%
            select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
            as.matrix %>%
            {.}, 

        label = data_xgboost %>% 
            filter (dataset == "Calibration subset") %>%
            select (e.cum) %>% 
            as.matrix %>%
            {.}

    ),

    max.depth = 6, nrounds = 300, eta = 0.3, min_child_weight = 0.5, subsample = 0.8,

    verbose = FALSE,
    
    objective = "reg:squarederror"

)
xgboost_predictions = data_xgboost %>%

    mutate (e.cum_hat = predict (
        
                xgboost_model, 
                newdata = data_xgboost %>% 
                        select (- pmid, - e.cum, - e.rel, - dataset, - country) %>% 
                        as.matrix
            )
    ) %>%

    mutate (e.rel_hat = e.cum_hat / tan.app) %>%

    mutate (app.mthd = recode (as.character (app.mthd), "1" = "bc", "2" = "ts", "3" = "os", "4" = "bsth")) %>%

    select (pmid, time, e = e.cum_hat, er = e.rel_hat, truth_e = e.cum, truth_er = e.rel, dataset, country, app.mthd)

Lasso

Predictions of emissions using Lasso regression.

load (file = "scripts/processed_data/data_lasso.Rdata")
set.seed (123)
lasso_model = cv.glmnet (  

    x = data_lasso %>% 
        filter (dataset == "Calibration subset") %>%
        select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
        as.matrix %>%
        {.}, 

    y = data_lasso %>% 
        filter (dataset == "Calibration subset") %>%
        select (e.cum) %>% 
        as.matrix %>%
        {.},

    alpha = 1
)
lasso_predictions_vector = predict (
        
    lasso_model, 

    data_lasso %>% 
            select (- pmid, - e.cum, - e.rel, - dataset, - country) %>% 
            as.matrix
)


df_join = data_alfam2 %>% select (pmid, app.mthd) %>% distinct

lasso_predictions = data_lasso %>%

    left_join (df_join, by = "pmid") %>%

    mutate (e.cum_hat = exp (lasso_predictions_vector), e.cum = exp (e.cum)) %>%

    mutate (e.rel_hat = e.cum_hat / tan.app) %>%

    select (pmid, time, e = e.cum_hat, er = e.rel_hat, truth_e = e.cum, truth_er = e.rel, dataset, country, app.mthd) 

Comparison

predictions_of_all_methods = rbind (
    alfam2_predictions %>% mutate (method = "alfam2"), 
    random_forest_predictions %>% mutate (method = "random forest"),
    xgboost_predictions %>% mutate (method = "xgboost"),
    lasso_predictions %>% mutate (method = "lasso")
)
predictions_of_all_methods %>% head %>% mutate_if (is.numeric, round, digits = 1) %>% kable
pmid time e er truth_e truth_er dataset country app.mthd method
182 44.8 21.1 0.2 11.8 0.1 Calibration subset DK bc alfam2
183 45.2 7.3 0.1 5.5 0.1 Calibration subset DK bsth alfam2
184 45.1 14.7 0.2 11.2 0.1 Calibration subset DK bc alfam2
185 45.5 7.9 0.1 7.3 0.1 Calibration subset DK bsth alfam2
186 43.9 14.8 0.3 9.1 0.2 Calibration subset DK bc alfam2
187 45.2 8.5 0.2 6.5 0.1 Calibration subset DK bsth alfam2

Residuals

Comparison of residual for the calibration and evaluation subsets.

plot_residuals_complete_dataset = predictions_of_all_methods %>%
    mutate (residuals = truth_er - er) %>%
    mutate (app.mthd = recode (app.mthd, "bc" = "Broadcast", "bsth" = "Trailing hoses", "ts" = "Trailing shoes", "os" = "Open slot injection")) %>%
    mutate (app.mthd = factor (app.mthd, levels = c ("Broadcast", "Trailing hoses", "Trailing shoes", "Open slot injection"))) %>%
    ggplot () +
        geom_boxplot (aes (x = country, y = residuals, fill = method)) +
        geom_hline (yintercept = 0, linetype = 2) +
        scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
        facet_wrap (~ app.mthd, ncol = 1) +
        ylab ("Residual (frac. applied TAN)") +
        labs (fill = "") +
        theme (legend.position = "bottom",
               axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
               strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))

plot_residuals_complete_dataset

Comparison of residual for the evaluation subset.

plot_residuals_evaluation_subset = predictions_of_all_methods %>%
    filter (dataset == "Evaluation subset") %>%
    mutate (residuals = truth_er - er) %>%
    mutate (app.mthd = recode (app.mthd, "bc" = "Broadcast", "bsth" = "Trailing hoses", "ts" = "Trailing shoes", "os" = "Open slot injection")) %>%
    mutate (app.mthd = factor (app.mthd, levels = c ("Broadcast", "Trailing hoses", "Trailing shoes", "Open slot injection"))) %>%
    ggplot () +
        geom_boxplot (aes (x = country, y = residuals, fill = method), varwidth = TRUE) +
        geom_hline (yintercept = 0, linetype = 2) +
        scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
        facet_wrap (~ app.mthd, ncol = 1) +
        ylab ("Residual (frac. applied TAN)") +
        labs (fill = "") +
        theme (legend.position = "bottom",
               axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
               strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))

plot_residuals_evaluation_subset

Observed vs predicted values

plot_observed_vs_predicted_values = predictions_of_all_methods %>% 

    filter (dataset == "Evaluation subset") %>%

    mutate (method = recode (method, "alfam2" = "A", "lasso" = "B", "random forest" = "C", "xgboost" = "D")) %>%
    
    ggplot () +

        geom_point (aes (x = truth_e, y = e, color = method), size = 3) +

        geom_abline (slope = 1, linetype = "dashed") +

        facet_wrap (~ method) +

        scale_color_manual (values = Dark2[c(2, 5, 6, 8)]) +

        labs (color = "") +

        theme (legend.position = "none",
               axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
               axis.title.x = element_text (margin = ggplot2::margin (t = 15, b = 20)),
               strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +

        xlab ("Observed values (kg/ha)") + ylab ("Predicted values (kg/ha)") +

        NULL

plot_observed_vs_predicted_values

Evaluation metrics

df_evaluation_metrics = rbind (
    
    predictions_of_all_methods %>% 
        select (prediction = e, truth = truth_e, dataset, method) %>% 
        mutate (response = "72h cum. emission"),
    
    predictions_of_all_methods %>% 
        select (prediction = er, truth = truth_er, dataset, method) %>% 
        mutate (response = "72h relative cum. emission")
    
) %>%

    summarise (
      
        MSE = mean ((prediction - truth) ^ 2),
      
        RMSE = sqrt (mean ((prediction - truth) ^ 2)),
        
        Pearsons_r = cor (prediction, truth),

        ME = 1 - (sum ( (prediction - truth) ^ 2) / sum ( (truth - mean (truth)) ^ 2)),

        MAE = mean (abs (prediction - truth)),
        
        MBE = mean (prediction - truth),

        .by = c (dataset, method, response)

) %>% 

    mutate_if (is.numeric, round, digits = 2)

df_evaluation_metrics %>%  arrange (response, dataset) %>% kable ()
dataset method response MSE RMSE Pearsons_r ME MAE MBE
Calibration subset alfam2 72h cum. emission 113.86 10.67 0.82 0.61 6.64 -2.98
Calibration subset random forest 72h cum. emission 12.36 3.52 0.98 0.96 2.21 0.14
Calibration subset xgboost 72h cum. emission 0.54 0.73 1.00 1.00 0.26 0.00
Calibration subset lasso 72h cum. emission 122.02 11.05 0.78 0.58 6.70 -2.56
Evaluation subset alfam2 72h cum. emission 65.29 8.08 0.86 0.62 5.65 -2.81
Evaluation subset random forest 72h cum. emission 20.34 4.51 0.94 0.88 3.28 0.92
Evaluation subset xgboost 72h cum. emission 38.34 6.19 0.90 0.78 4.09 0.51
Evaluation subset lasso 72h cum. emission 53.32 7.30 0.84 0.69 5.57 -1.38
Calibration subset alfam2 72h relative cum. emission 0.03 0.18 0.80 0.53 0.12 -0.06
Calibration subset random forest 72h relative cum. emission 0.00 0.05 0.98 0.96 0.04 0.01
Calibration subset xgboost 72h relative cum. emission 0.00 0.01 1.00 1.00 0.00 0.00
Calibration subset lasso 72h relative cum. emission 0.03 0.16 0.79 0.59 0.11 -0.04
Evaluation subset alfam2 72h relative cum. emission 0.02 0.15 0.79 0.58 0.11 -0.04
Evaluation subset random forest 72h relative cum. emission 0.01 0.10 0.92 0.81 0.08 0.03
Evaluation subset xgboost 72h relative cum. emission 0.01 0.12 0.89 0.75 0.08 0.01
Evaluation subset lasso 72h relative cum. emission 0.02 0.16 0.77 0.55 0.12 0.00
df_evaluation_metrics %>%
  
    filter (response == "72h cum. emission") %>%
  
    select (- MSE, - RMSE, - Pearsons_r, - ME) %>%

    pivot_longer (cols = c (MAE, MBE)) %>%

    arrange (name, dataset) %>%

    mutate (variation = (1 - abs(value) / abs(value [method == "alfam2"])) * 100, .by = c(name, dataset)) %>%
    mutate (variation = round (variation, digits = 1)) %>%
    mutate (value = round (value, digits = 1)) %>%
  
    kable ()
dataset method response name value variation
Calibration subset alfam2 72h cum. emission MAE 6.6 0.0
Calibration subset random forest 72h cum. emission MAE 2.2 66.7
Calibration subset xgboost 72h cum. emission MAE 0.3 96.1
Calibration subset lasso 72h cum. emission MAE 6.7 -0.9
Evaluation subset alfam2 72h cum. emission MAE 5.7 0.0
Evaluation subset random forest 72h cum. emission MAE 3.3 41.9
Evaluation subset xgboost 72h cum. emission MAE 4.1 27.6
Evaluation subset lasso 72h cum. emission MAE 5.6 1.4
Calibration subset alfam2 72h cum. emission MBE -3.0 0.0
Calibration subset random forest 72h cum. emission MBE 0.1 95.3
Calibration subset xgboost 72h cum. emission MBE 0.0 100.0
Calibration subset lasso 72h cum. emission MBE -2.6 14.1
Evaluation subset alfam2 72h cum. emission MBE -2.8 0.0
Evaluation subset random forest 72h cum. emission MBE 0.9 67.3
Evaluation subset xgboost 72h cum. emission MBE 0.5 81.9
Evaluation subset lasso 72h cum. emission MBE -1.4 50.9
# Evaluation on evalutation subset
df_plot = df_evaluation_metrics %>%
    filter (response == "72h cum. emission") %>%
    select (- MSE, - RMSE) %>%
    filter (dataset == "Evaluation subset") %>%
    pivot_longer (cols = c (Pearsons_r, ME, MAE, MBE)) %>%
    mutate (name = factor (name, levels = c("Pearsons_r", "ME", "MAE", "MBE")))

suppressWarnings(ggplot (df_plot) +
    geom_histogram (aes (x = name, y = value, fill = method), position = "dodge", stat = "identity") +
    facet_wrap (~ name, scales = "free", nrow = 1) + 
    xlab ("") + ylab ("") + labs (fill = "") +
    theme (legend.position = "none ", 
           strip.text.y = element_text(angle = 0), 
           axis.text.x = element_blank(), 
           axis.ticks.x = element_blank(),
           strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
    scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
    ggtitle ("Evaluation subset") + theme (plot.title = element_text (margin= ggplot2::margin(0,0,20,0), hjust = 0, size = 30)))

# Evaluation on calibration subset
df_plot = df_evaluation_metrics %>%
    filter (response == "72h cum. emission") %>%
    select (- MSE, - RMSE) %>%
    filter (dataset == "Calibration subset") %>%
    pivot_longer (cols = c (Pearsons_r, ME, MAE, MBE)) %>%
    mutate (name = factor (name, levels = c("Pearsons_r", "ME", "MAE", "MBE")))

suppressWarnings(ggplot (df_plot) +
    geom_histogram (aes (x = name, y = value, fill = method), position = "dodge", stat = "identity") +
    facet_wrap (~ name, scales = "free", nrow = 1) + 
    xlab ("") + ylab ("") + labs (fill = "") +
    theme (legend.position = "bottom", 
           strip.text.y = element_text(angle = 0), 
           axis.text.x = element_blank(), 
           axis.ticks.x = element_blank(),
           strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
    scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
    ggtitle ("Calibration subset") + theme (plot.title = element_text (margin= ggplot2::margin(0,0,20,0), hjust = 0, size = 30)))

Part 2 - New training with the whole dataset

Random forest

set.seed (123)
random_forest_model = randomForest (
    
        e.cum ~ ., 

        data = data_random_forest %>% select (- pmid, - e.rel, - dataset, - country),

        importance = TRUE,
    
        mtry = 19, nodesize = 3, replace = FALSE, sample_frac = 0.8
    
)

Shapley

df_tmp = data_random_forest_calibration %>% select (- e.cum)

unified_model_rf <- treeshap::randomForest.unify (random_forest_model, df_tmp)

treeshap_rf <- treeshap::treeshap (unified_model_rf, df_tmp, verbose = FALSE)

shapley_rf <- shapviz (treeshap_rf, X = data_random_forest_calibration %>% select (- e.cum))
plot_shap_rf = shapviz::sv_importance (shapley_rf, kind = "beeswarm", max_display = 10L) +
    scale_colour_gradient (low = "darkblue", high = "red", breaks = c(0, 1), labels = c("low", "high")) +
    theme (
      legend.title = element_text (size = 25, margin = ggplot2::margin (r = 50, b = 100)),
      legend.position = "bottom",
      legend.key.width = unit(3, "cm"),
      axis.title.x = element_text (margin = ggplot2::margin( b = 30))
    )

plot_shap_rf

Gain

df_importance_rf = randomForest::importance (random_forest_model) %>%
    as.data.frame %>%
    rownames_to_column (var = "Variable") %>%
    mutate (`%IncMSE` = `%IncMSE` / max (`%IncMSE`)) %>%
    mutate (type = "A")

plot_importance_rf = ggplot (df_importance_rf) + 
        geom_point (aes (x = `%IncMSE`, y = Variable), size = 5) +
        scale_y_discrete (limits = df_importance_rf %>% arrange (`%IncMSE`) %>% pull (Variable)) +
        theme (strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
        xlab ("Importance") +
        ylab ("") +
        facet_wrap (~ type)

plot_importance_rf

df_importance_rf %>%
    arrange (desc (`%IncMSE`)) %>%
    mutate_if (is.numeric, round, digits = 2) %>%
    kable ()
Variable %IncMSE IncNodePurity type
app.mthd 1.00 18855.07 A
tan.app 0.61 18809.39 A
man.ph 0.55 13928.94 A
incorp 0.43 6180.64 A
t.incorp 0.33 4605.37 A
man.dm 0.33 5602.95 A
app.rate 0.24 4444.92 A
temp_6 0.23 1241.42 A
rain_6 0.21 1181.81 A
rain_1 0.20 371.33 A
temp_5 0.19 1166.82 A
time 0.19 2469.28 A
temp_1 0.18 1138.24 A
wind_2 0.17 1647.65 A
temp_2 0.17 1546.45 A
wind_1 0.15 2374.54 A
wind_3 0.14 1335.06 A
temp_4 0.14 1038.28 A
rain_2 0.14 254.33 A
rain_5 0.13 118.99 A
wind_6 0.13 1701.91 A
temp_3 0.12 1617.32 A
rain_4 0.11 132.87 A
man.source 0.08 688.20 A
rain_3 0.08 140.28 A
wind_5 0.08 597.15 A
wind_4 0.07 602.99 A

Xgboost

set.seed (123)
xgboost_model = xgboost (  

    data = xgb.DMatrix (

        data = data_xgboost %>% 
            select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
            as.matrix %>%
            {.}, 

        label = data_xgboost %>% 
            select (e.cum) %>% 
            as.matrix %>%
            {.}

    ),

    max.depth = 6, nrounds = 300, eta = 0.3, min_child_weight = 0.5, subsample = 0.8,

    verbose = FALSE,
    
    objective = "reg:squarederror"

)

Shapley

df_tmp = data_xgboost %>% select (- pmid, - e.cum, - e.rel, - dataset, - country)

unified_model_xgboost <- treeshap::xgboost.unify (xgboost_model, as.matrix(df_tmp))

treeshap_xgboost <- treeshap::treeshap (unified_model_xgboost, df_tmp, verbose = FALSE)

shapley_xgb <- shapviz::shapviz (treeshap_xgboost, X = df_tmp)
plot_shap_xgboost = shapviz::sv_importance (shapley_xgb, kind = "beeswarm", max_display = 10L) +
  scale_colour_gradient (low = "darkblue", high = "red", breaks = c(0, 1), labels = c("low", "high")) +
  theme (legend.title = element_text (size = 30, margin = ggplot2::margin (r = 50, b = 100)),
         legend.position = "bottom",
         legend.key.width = unit(3, "cm"),
         axis.title.x = element_text (margin = ggplot2::margin( b = 30)))

plot_shap_xgboost

Gain

df_importance_xgb = xgboost::xgb.importance (model = xgboost_model) %>%
    mutate (Gain = Gain / max (Gain)) %>%
    mutate (type = "B")

plot_importance_xgboost = df_importance_xgb %>%

    ggplot() +
        geom_point (aes (x = Gain, y = Feature), size = 5) +
        scale_y_discrete (limits = df_importance_xgb %>% arrange (Gain) %>% pull (Feature)) +
        theme (strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
        xlab ("Importance") + ylab ("") +
        facet_wrap (~ type)

plot_importance_xgboost

df_importance_xgb %>%
    arrange (desc (Gain)) %>%
    mutate_if (is.numeric, round, digits = 2) %>%
    kable ()
Feature Gain Cover Frequency type
app.mthd 1.00 0.01 0.02 B
tan.app 0.99 0.06 0.12 B
man.ph 0.75 0.12 0.06 B
incorp 0.44 0.00 0.02 B
man.dm 0.28 0.06 0.06 B
app.rate 0.16 0.03 0.05 B
time 0.14 0.18 0.24 B
temp_1 0.13 0.05 0.04 B
temp_2 0.07 0.05 0.04 B
temp_4 0.06 0.01 0.01 B
wind_2 0.06 0.04 0.03 B
wind_3 0.06 0.01 0.02 B
wind_6 0.05 0.04 0.04 B
rain_6 0.05 0.01 0.03 B
wind_1 0.05 0.09 0.05 B
temp_6 0.04 0.02 0.03 B
temp_3 0.03 0.02 0.02 B
temp_5 0.02 0.01 0.01 B
rain_4 0.02 0.06 0.02 B
t.incorp 0.01 0.00 0.00 B
rain_1 0.01 0.06 0.03 B
rain_2 0.01 0.01 0.02 B
rain_5 0.01 0.01 0.00 B
rain_3 0.01 0.01 0.01 B
wind_4 0.01 0.01 0.01 B
wind_5 0.00 0.01 0.01 B
man.source 0.00 0.00 0.00 B

Lasso

set.seed (123)
lasso_model = cv.glmnet (  

    x = data_lasso %>% 
        select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
        as.matrix %>%
        {.}, 

    y = data_lasso %>% 
        select (e.cum) %>% 
        as.matrix %>%
        {.},

    alpha = 1

)

Part 3 - Testing scenarios

load (file = "scripts/processed_data/data_scenarios.Rdata")

data_scenarios = data_scenarios %>%

    mutate (t.incorp = as.factor (t.incorp)) %>%
    mutate (t.incorp = recode (t.incorp, "1000" = "-"))

Random forest

load (file = "scripts/processed_data/data_scenarios_random_forest.Rdata")

data_tmp = data_scenarios_random_forest

random_forest_predictions_scenarios_vector = predict (
        random_forest_model,       
        newdata = rbind (data_random_forest_calibration[1, -1], data_tmp)[- 1, ]
)

data_scenarios = data_scenarios %>% 

    mutate (e.cum_hat_random_forest = random_forest_predictions_scenarios_vector, .before = time) %>%

    mutate (efficacy_random_forest = ((e.cum_hat_random_forest / e.cum_hat_random_forest[app.mthd == "bc" & incorp == "none" & t.incorp == "-"]) - 1) * 100, 
            .by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain)) %>% 

    {.}

Xgboost

load (file = "scripts/processed_data/data_scenarios_xgboost.Rdata")

xgboost_predictions_scenarios_vector = predict (
        
    xgboost_model, 

    data_scenarios_xgboost %>% as.matrix
)

data_scenarios = data_scenarios %>% 

    mutate (e.cum_hat_xgboost = xgboost_predictions_scenarios_vector, .before = time) %>%

    mutate (efficacy_xgboost = ((e.cum_hat_xgboost / e.cum_hat_xgboost[app.mthd == "bc" & incorp == "none" & t.incorp == "-"]) - 1) * 100, 
            .by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain))

Lasso

load (file = "scripts/processed_data/data_scenarios_lasso.Rdata")

lasso_predictions_scenarios_vector = predict (
        
    lasso_model, 

    data_scenarios_lasso %>% as.matrix
)


data_scenarios = data_scenarios %>% 

    mutate (e.cum_hat_lasso = lasso_predictions_scenarios_vector, .before = time) %>%

    mutate (efficacy_lasso = ((e.cum_hat_lasso / e.cum_hat_lasso[app.mthd == "bc" & incorp == "none"]) - 1) * 100, 
            .by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain))

ALFAM2

load (file = "scripts/processed_data/data_scenarios_alfam2.Rdata")

alfam2_predictions_scenarios_vector =  alfam2 (
        pars = alfam2pars01,
        dat = data_scenarios_alfam2,
        app.name = "tan.app",
        time.name = "time",
        time.incorp = "t.incorp",
        group = "pmid",
        prep = TRUE,
        warn = FALSE
)  %>%

    filter (time == max (time), .by = pmid) %>%
    select (pmid, e.cum_hat_alfam2 = e)

data_scenarios = data_scenarios %>%

    left_join (alfam2_predictions_scenarios_vector, by = "pmid") %>%

    mutate (efficacy_alfam2 = ((e.cum_hat_alfam2 / e.cum_hat_alfam2[app.mthd == "bc" & incorp == "none" & t.incorp == "-"]) - 1) * 100, 
            .by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain))

Comparison

plot_scenario_predictions = data_scenarios %>% 
    select (app.mthd, incorp, t.incorp, man.ph,
            efficacy_random_forest, efficacy_xgboost, efficacy_lasso, efficacy_alfam2) %>%

    rename (efficacy_rforest = efficacy_random_forest) %>%

    pivot_longer (cols = c (5 : 8), names_to = c (".value", "method"), names_pattern = "(.+)_(.+)") %>%
    mutate (method = recode (method, "rforest" = "random forest")) %>%

    mutate (method = recode (method, "alfam2" = "A", "lasso" = "B", "random forest" = "C", "xgboost" = "D")) %>%

    mutate (group = paste (app.mthd, incorp, t.incorp)) %>%
    filter (! (app.mthd == "bc" & incorp == "none")) %>%

    mutate (group = recode (group,
                            "bc shallow 0" = "incorporation",
                            "bsth none -" = "trailing hoses",
                            "os none -" = "open slot",
                            "ts none -" = "trailing shoes")) %>%
    
    ggplot () +
        
        geom_boxplot (aes (x = efficacy, y = group, fill = group)) +
        scale_fill_manual (values = Dark2[c(2 : 5)]) +
        scale_y_discrete (limits = c ("open slot", "trailing shoes", "trailing hoses", "incorporation")) +
        ylab ("") + xlab ("Efficacy") +
        theme (legend.position = "none",
               axis.ticks.y = element_blank(),
               strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
        facet_wrap (~ method, ncol = 1)

plot_scenario_predictions

Scenarios for which xgboost predicted a positive efficacy:

data_scenarios %>%
    filter (efficacy_xgboost > 0) %>%
    select (efficacy_xgboost, tan.app, app.mthd, app.rate, man.source, incorp, group_temp, group_wind, group_rain) %>%
    kable ()
efficacy_xgboost tan.app app.mthd app.rate man.source incorp group_temp group_wind group_rain
8.9906385 80.3025 ts 36.650 pig none q1 q1 q1
9.0057694 80.3025 ts 36.650 cat none q1 q1 q1
0.5613908 80.3025 bsth 18.725 pig none q1 q1 q2
0.5621355 80.3025 bsth 18.725 cat none q1 q1 q2
25.0445690 80.3025 bsth 36.650 pig none q1 q1 q2
18.2393264 80.3025 os 36.650 pig none q1 q1 q2
32.8066167 80.3025 ts 36.650 pig none q1 q1 q2
25.0855518 80.3025 bsth 36.650 cat none q1 q1 q2
18.2691620 80.3025 os 36.650 cat none q1 q1 q2
32.8602812 80.3025 ts 36.650 cat none q1 q1 q2
30.7067041 80.3025 bsth 36.650 pig none q1 q2 q2
23.4712954 80.3025 os 36.650 pig none q1 q2 q2
25.4853738 80.3025 ts 36.650 pig none q1 q2 q2
30.7704933 80.3025 bsth 36.650 cat none q1 q2 q2
23.5200540 80.3025 os 36.650 cat none q1 q2 q2
25.5383165 80.3025 ts 36.650 cat none q1 q2 q2
4.5797139 36.6595 ts 36.650 pig none q2 q1 q1
4.5901176 36.6595 ts 36.650 cat none q2 q1 q1
11.1164005 80.3025 ts 36.650 pig none q2 q1 q1
11.4099749 80.3025 ts 36.650 cat none q2 q1 q1
9.3674813 36.6595 ts 36.650 pig none q2 q1 q2
9.3912529 36.6595 ts 36.650 cat none q2 q1 q2
10.4259527 80.3025 bsth 18.725 pig none q2 q1 q2
7.6296711 80.3025 os 18.725 pig none q2 q1 q2
14.5180492 80.3025 ts 18.725 pig none q2 q1 q2
10.6716109 80.3025 bsth 18.725 cat none q2 q1 q2
7.8093910 80.3025 os 18.725 cat none q2 q1 q2
14.8600869 80.3025 ts 18.725 cat none q2 q1 q2
15.0486933 80.3025 bsth 36.650 pig none q2 q1 q2
11.4423307 80.3025 os 36.650 pig none q2 q1 q2
32.0733336 80.3025 ts 36.650 pig none q2 q1 q2
15.4434423 80.3025 bsth 36.650 cat none q2 q1 q2
11.7424345 80.3025 os 36.650 cat none q2 q1 q2
32.9145756 80.3025 ts 36.650 cat none q2 q1 q2
8.9087332 80.3025 ts 36.650 pig none q2 q2 q1
9.0807629 80.3025 ts 36.650 cat none q2 q2 q1
2.5264251 80.3025 bsth 18.725 pig none q2 q2 q2
0.7573102 80.3025 os 18.725 pig none q2 q2 q2
2.5725532 80.3025 bsth 18.725 cat none q2 q2 q2
0.7710910 80.3025 os 18.725 cat none q2 q2 q2
30.5988491 80.3025 bsth 36.650 pig none q2 q2 q2
27.8218260 80.3025 os 36.650 pig none q2 q2 q2
33.8994959 80.3025 ts 36.650 pig none q2 q2 q2
31.3037845 80.3025 bsth 36.650 cat none q2 q2 q2
28.4628769 80.3025 os 36.650 cat none q2 q2 q2
34.6804699 80.3025 ts 36.650 cat none q2 q2 q2
data_scenarios %>%
    filter (efficacy_xgboost > 0) %>%
    select (tan.app, app.mthd, app.rate, man.source, incorp, group_temp, group_wind, group_rain) %>%
    lapply (table)
$tan.app

36.6595 80.3025 
      4      42 

$app.mthd

bsth   os   ts 
  14   12   20 

$app.rate

18.725  36.65 
    12     34 

$man.source

cat pig 
 23  23 

$incorp

none 
  46 

$group_temp

q1 q2 
16 30 

$group_wind

q1 q2 
28 18 

$group_rain

q1 q2 
 8 38 
data_tmp = data_scenarios %>%
    filter (efficacy_xgboost > 0) %>%
    select (efficacy_xgboost, tan.app, app.mthd, app.rate, man.source, incorp, group_temp, group_wind, group_rain) %>%
    mutate (group_rain = ifelse (group_rain == "q1", "low", "high")) %>%
    mutate (tan.app = paste ("TAN = ", round (tan.app, digits = 1), " kg/ha", sep = ""),
            app.rate = paste ("Application rate = ", round (app.rate, digits = 1), " t/ha or m3/ha", sep = ""),
            group_rain = paste ("Rain = ", group_rain))

plot = ggplot (data_tmp) +
    geom_histogram (aes (efficacy_xgboost)) +
    facet_wrap (~ tan.app + app.rate + group_rain) +
    xlab ("Efficacy") +
    ylab ("Number of predictions") +
    theme (axis.title.y = element_text (margin = ggplot2::margin (r = 20), size = 18),
           strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)),
           strip.text = element_text (size = 18),
           axis.text = element_text (size = 18),
           axis.title.x = element_text (size = 18))

plot

data_scenarios %>%
  filter (! (app.mthd == "bc" & incorp == "none")) %>%
  summarise (mean_rf = mean (efficacy_random_forest), 
             mean_alfam2 = mean (efficacy_alfam2), 
             mean_gbf = mean (efficacy_xgboost), 
             mean_lasso = mean (efficacy_lasso), .by = c ('app.mthd')) %>%
  mutate_if (is.numeric, round, digits = 1) %>%
  kable()
app.mthd mean_rf mean_alfam2 mean_gbf mean_lasso
bsth -52.1 -34.3 -38.6 -23.7
os -52.8 -61.8 -41.9 -42.5
ts -50.9 -34.3 -35.0 -23.7
bc -16.6 -43.1 -34.7 -22.0
data_scenarios %>%
  filter (! (app.mthd == "bc" & incorp == "none")) %>%
  summarise (median_rf = median (efficacy_random_forest), 
             median_alfam2 = median (efficacy_alfam2), 
             median_gbf = median (efficacy_xgboost), 
             median_lasso = median (efficacy_lasso), .by = c ('app.mthd')) %>%
  mutate_if (is.numeric, round, digits = 1) %>%
  kable()
app.mthd median_rf median_alfam2 median_gbf median_lasso
bsth -53.5 -34.1 -41.2 -23.4
os -54.1 -61.7 -42.6 -42.0
ts -52.6 -34.1 -35.4 -23.4
bc -13.6 -43.7 -34.6 -21.7

Figures

legend = cowplot::get_legend (plot_shap_rf)

plot_shap_rf_2 = plot_shap_rf +
    annotate (geom = "label", label = "A", size = 22, fontface = "bold", x = 45, y = 1.5, fill = "#000080", color = "white") +
    xlim (c (- 18, 45)) +
    theme (legend.position = "none", 
          plot.margin = unit (c (0, 0, 1, 0), "cm"),
          axis.text.x = element_blank(),
          axis.title.x = element_blank(),
          axis.ticks.x = element_blank())

plot_shap_xgboost_2 = plot_shap_xgboost +
    annotate (geom = "label", label = "B", size = 22, fontface = "bold", x = 45, y = 1.5, fill = "#000080", color = "white") +
    xlim (c (- 18, 45)) + 
    theme (legend.position = "none", 
          plot.margin = unit (c (0, 0, 1, 0), "cm"),
          axis.title.x = element_text (margin = ggplot2::margin (t = 10, b = 20)))


png (file = "figures/plot_shap.png", width = 1000, height = 900)
do.call ("grid.arrange", 
         c (list(plot_shap_rf_2, 
                 plot_shap_xgboost_2, 
                 legend), 
            list(layout_matrix = as.matrix(c (1, 2, 3)),
                  heights = c (0.8, 1, 0.2))))
dev.off()
png 
  2 
png (file = "figures/plot_importance.png", width = 1300, height = 800)
grid.arrange (plot_importance_rf, plot_importance_xgboost, nrow = 1)
dev.off()
png 
  2 

Supplementary material

data = read_excel("data/Peng_Xu_et_al_2024/Supplementary_Table_3_use2775.xlsx", sheet = 2)
head (data) %>% select (- Title) %>% kable()
No. Author Language Year Replicates Fertilizer type Nitrogen placement Fertilizer application time Soil tillage practices Crop type Water input Tem SOC TN pH BD Clay CEC Nrate Erate
1 Lin et al. (2015) Chinese 2013 3 U SBC 1 CT Rice 1289 26.0 5.4 0.6 7.9 1.3 20.0 18.0 113 22.1
1 Lin et al. (2015) Chinese 2013 3 U SBC 1 CT Rice 1289 26.0 5.4 0.6 7.9 1.3 20.0 18.0 150 32.7
1 Lin et al. (2015) Chinese 2013 3 U SBC 1 CT Rice 1289 26.0 5.4 0.6 7.9 1.3 20.0 18.0 188 35.9
1 Lin et al. (2015) Chinese 2013 3 U SBC 1 CT Rice 1289 26.0 5.4 0.6 7.9 1.3 20.0 18.0 225 44.6
1 Lin et al. (2015) Chinese 2013 3 U SBC 1 CT Rice 1289 26.0 5.4 0.6 7.9 1.3 20.0 18.0 300 63.1
2 Zhu et al. (2013) Chinese 2012 3 Others Mix 2 CT Rice 1500 25.1 21.9 1.9 5.8 1.1 38.3 11.4 113 38.3
plot_temperature_comparison = rbind (
    data %>% select (temp = Tem) %>% mutate (data = "Data from Peng Xu et al (2024)"),
    data_alfam2 %>% select (temp = air.temp) %>% mutate (data = "Data used for the alfam2 model")
) %>%

    ggplot () +
        geom_histogram (aes (x = temp)) +
        facet_wrap (~ data, ncol = 1, scales = "free") +
        theme (strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))+
        xlab ("Air temperature (°C)") +
        ylab ("Number of observations")

plot_temperature_comparison

rbind (
    data %>% select (temp = Tem) %>% mutate (data = "Data from Peng Xu et al (2024)"),
    data_alfam2 %>% select (temp = air.temp) %>% mutate (data = "Data used for the alfam2 model")
) %>%
  
  summarise (mean_temp = mean (temp), sd_temp = sd (temp), .by = data)
# A tibble: 2 × 3
  data                           mean_temp sd_temp
  <chr>                              <dbl>   <dbl>
1 Data from Peng Xu et al (2024)      20.7    6.86
2 Data used for the alfam2 model      13.3    5.48